In [52]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
plt.rcParams['font.family']='Serif'
%matplotlib inline
import seaborn as sns

In [34]:
from sklearn import neighbors
from sklearn.neighbors import KernelDensity
kde = KernelDensity(0.2)

In [79]:
npr.seed(0)
kde.fit(npr.rand(100,100)*10)


Out[79]:
KernelDensity(algorithm='auto', atol=0, bandwidth=0.2, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [80]:
X = kde.sample(10000)
density = np.exp(kde.score_samples(X))

In [93]:
X_uniform = npr.rand(1000,2)
density_uniform = np.ones(len(X_uniform))

In [81]:
plt.scatter(X[:,0],X[:,1],c=density,cmap='Blues')


Out[81]:
<matplotlib.collections.PathCollection at 0x11a4c3940>

In [94]:
plt.scatter(X_uniform[:,0],X_uniform[:,1],c=density_uniform,cmap='Blues')


Out[94]:
<matplotlib.collections.PathCollection at 0x11aec08d0>

In [95]:
def local_density_k(X,k=10,metric=None):
    if metric != None:
        bt = neighbors.BallTree(X,200,metric=metric)
        neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
    else:
        neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
    distances = np.array(neighbor_graph.mean(1))[:,0]
    return 1-((distances - distances.min())/(distances.max() - distances.min()))

def local_density_r(X,r=0.1,metric=None):
    if metric != None:
        bt = neighbors.BallTree(X,200,metric=metric)
        neighbor_graph = neighbors.radius_neighbors_graph(bt,r)
    else:
        neighbor_graph = neighbors.radius_neighbors_graph(X,r)
    counts = np.array(neighbor_graph.sum(1))[:,0]
    return ((counts - counts.min())/(counts.max() - counts.min()))

In [101]:
d_k = local_density_k(X_uniform)
d_r = local_density_r(X_uniform,r=0.1)

In [111]:
plt.scatter(d_k,np.log(density_uniform)+npr.randn(len(d_k))*0.001)


Out[111]:
<matplotlib.collections.PathCollection at 0x11c1ca9e8>

In [103]:
plt.scatter(d_r,np.log(density))


Out[103]:
<matplotlib.collections.PathCollection at 0x11bb70400>

Kraskov: http://journals.aps.org/pre/pdf/10.1103/PhysRevE.69.066138

Mutual information is defined as: $$ I(X,Y) = \int \int dx dy \mu(x,y) \log \frac{\mu(x,y)}{\mu_x(x) \mu_y(y)} $$

Binned estimator: $$I(X,Y) \approx I_{binned}(X,Y) \equiv \sum_{i,j} p(i,j) \log \frac{p(i,j)}{p_x(i) p_y(j)}$$


In [ ]:


In [166]:
npr.seed(0)
X = npr.rand(1000)
def f(x,noise_size=0.1):
    return (x-0.5)**2 + npr.rand()*noise_size
Y = np.array([f(x,0.001) for x in X])

In [167]:
plt.scatter(X,Y)


Out[167]:
<matplotlib.collections.PathCollection at 0x11c6e5160>

In [186]:
def binned_MI_vec(X,Y,bins=50):
    hist = np.histogram2d(X,Y,bins=bins)[0]
    #hist*np.log(hist / np.max(1,hist[:]))
    mi_est = 0
    p_x = hist.sum(0)
    p_y = hist.sum(1)
    ind = np.outer(p_x,p_y)
    ind[ind==0]=1
    tmp = hist/ind
    tmp[tmp<1]=1
    n = len(X)
    return np.sum((hist)*np.log(tmp))
    #for i in range(len(hist)):
    #    for j in range(len(hist)):
    #        mi_est += hist[i,j] * np.log(hist[i,j]/np.max(1,(p_x[i]*p_y[j])))
    #return mi_est

In [203]:
#def binned_MI_loop(X,Y,bins=50):
bins=50
n = len(X)
hist = np.histogram2d(X,Y,bins=bins)[0] / n
#hist*np.log(hist / np.max(1,hist[:]))
mi_est = 0
p_x = hist.sum(0)
p_y = hist.sum(1)


for i in range(bins):
    for j in range(bins):
        mi_est += hist[i,j] * np.log(np.max(1,hist[i,j]/np.max(1,(p_x[i]*p_y[j]))))
#return mi_est
mi_est


Out[203]:
0.0

In [201]:
plt.imshow(hist)


Out[201]:
<matplotlib.image.AxesImage at 0x11cc07898>

In [182]:
hist = np.histogram2d(X,Y,bins=50)[0]
plt.imshow(hist)


Out[182]:
<matplotlib.image.AxesImage at 0x11c95ac50>

In [183]:
p_x = hist.sum(0)
p_y = hist.sum(1)
ind = np.outer(p_x,p_y)

In [184]:
ind = np.outer(p_x,p_y)
ind[ind==0]=1
ind.min()


Out[184]:
60.0

In [185]:
plt.imshow(ind)


Out[185]:
<matplotlib.image.AxesImage at 0x11af18eb8>

In [178]:
np.min(p_x)


Out[178]:
0.0

In [179]:
p_x = hist.sum(0)
p_x.shape,hist.shape


Out[179]:
((50,), (50, 50))

In [195]:
binned_MI_loop(X,Y)


Out[195]:
nan

In [211]:
from scipy.special import digamma

Eq. 4 from Kraskov $$H(X) \approx \frac{1}{N-1} \sum_{i=1}^{N-1} \log(x_{i+1}-x_i) + \psi(1) - \psi(N)$$


In [237]:
def estimate_entropy(X):
    X_ = np.sort(X)
    N = len(X)
    H = 0
    for i in range(N-1):
        #H += np.log(X_[i+1]-X_[i])#+digamma(1)-digamma(N)
        #H += X_[i+1]-X_[i]
        H += np.log(X_[i+1]-X_[i])+digamma(1)-digamma(N)
    return (H/(N-1))

In [237]:


In [238]:
X_ = np.sort(X)

In [239]:
plt.plot(X_)


Out[239]:
[<matplotlib.lines.Line2D at 0x11ceca668>]

In [240]:
estimate_entropy(X)


Out[240]:
-14.96252213410143

In [241]:
estimate_entropy(npr.rand(1000))


Out[241]:
-14.927801504635555

In [246]:
estimate_entropy(npr.randn(1000))


Out[246]:
-13.555714655112121

In [247]:
X_ = np.sort(X)

In [248]:
X_[1]-X_[0]


Out[248]:
0.00083738510200348504

In [255]:
def estimate_MI_knn(X,Y,k=10):
    XY = np.vstack((X,Y)).T
    bt = neighbors.BallTree(XY,k)

In [254]:
estimate_MI_knn(X,Y)


(1000, 2)

In [344]:
XY = np.vstack((X,Y)).T*10
k=100
bt = neighbors.BallTree(XY,k)
dist,ind=bt.query(XY[0],k)
dist = dist[0]
dist_to_k = dist[-1]
dist_to_k


Out[344]:
0.54548697070425201

In [349]:
XY = npr.randn(1000,2)
bt = neighbors.BallTree(XY,k)

In [350]:
dists = bt.query(XY,k)[0]
dists.shape


Out[350]:
(1000, 1)

In [342]:
dists.shape


Out[342]:
(1000, 1)

In [345]:
for k in range(1,k)[::10]:
    dists_to_k = dists[:,k]
    sns.kdeplot(dists_to_k)#,clip=(-0.1,0.6))
#plt.xlim(-0.1,0.6)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-345-0c73a9669e42> in <module>()
      1 for k in range(1,k)[::10]:
----> 2     dists_to_k = dists[:,k]
      3     sns.kdeplot(dists_to_k)#,clip=(-0.1,0.6))
      4 #plt.xlim(-0.1,0.6)

IndexError: index 1 is out of bounds for axis 1 with size 1

In [ ]:
dists_to_k